---
title: "Data sets structure - MicroMic"
author: "Tania Fort"
date: "15/02/2021"
output: html_document
editor_options: 
  chunk_output_type: console
---

```{r libraries}
# Libraries 
library(dada2)
library(ShortRead)
library(phyloseq)
library(ggplot2)
library(reshape2)
library(dplyr)
library(wesanderson)
library(tidyr)
library(purrr)
library(tibble)
library(ggbeeswarm)
library(psadd)
```

Set parameters
```{r set_parameters}
# Set paths
setwd("~/These/Projects")
outp.fig <-"MicroMic/Soumission_AFS/Files/Figures"
outp.tab <-"MicroMic/Soumission_AFS/Files/Tables"
```

Data structure by Run 
# Run 1 16S 
```{r run1, eval=FALSE}
############################
#                          #
#      Track sequences     #
#                          #
############################

############################
# Import data 
############################

# Import all data from each step of the pipeline 
raw <- countFastq("MicroMic/R_code/Pipelines/raw_data/demultiplexed_reads_16S_Run1", pattern = "fwd.fastq")
rownames(raw) <- gsub("_fwd.fastq","",rownames(raw)) 
raw$smp_names <- rownames(raw) 
out <- read.csv("MicroMic/R_code/Pipelines/output/Run1_16S/cutadapt_and_filtered/quality.csv", row.names = 1)
rownames(out) <- gsub("_rev.fastq","",rownames(out)) 
# Merge datasets
tabcounts <- merge(raw[,c("records","smp_names")], out, by = 0, all = T) 
rownames(tabcounts) <- tabcounts$smp_names 
tabcounts$Row.names <- NULL
tabcounts$smp_names <- NULL
colnames(tabcounts) <-  c("raw","cutadapt","filtered") 
  
pathR1 <- "MicroMic/R_code/Pipelines/output/Run1_16S/output/"
dadaFs <-  readRDS(file.path(pathR1,"dadaFs.RDS"))
dadaRs <- readRDS(file.path(pathR1,"dadaRs.RDS"))
mergers  <- readRDS(file.path(pathR1,"mergers.RDS"))
seqtab.nochim  <- readRDS(file.path(pathR1,"seqtab.no.chimera.RDS"))
collapse <- readRDS(file.path(pathR1,"seqtab.coll.RDS"))
dada2 <-  readRDS(file.path(pathR1,"ps.dada2.RDS"))
microbial <-  readRDS(file.path(pathR1,"ps.microbial.RDS"))
galan <- readRDS(file.path(pathR1,"ps.galan.RDS"))
decon <- readRDS(file.path(pathR1,"ps.decon.RDS"))
R1 <- readRDS(file.path(pathR1,"ps.vf.Run1.16S.RDS"))

############################
# Percentage of reads lost 
############################

# Get number of reads from RDS objects throughout the pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim), rowSums(collapse))
track <- merge(tabcounts,track,  by = 0, all = T)

trackps <- Reduce(function(x,y) merge(x = x, y = y, by ="names", all = T), 
       list(cbind(names = sample_names(dada2),dada2 = sample_sums(dada2)), 
            cbind(names = sample_names(microbial),microbial = sample_sums(microbial)), 
            cbind(names = sample_names(galan),galan = sample_sums(galan)), 
            cbind(names = sample_names(decon),decon = sample_sums(decon)),  
            cbind(names = sample_names(R1),vf = sample_sums(R1))))
trackall <- merge(track, trackps, by= 1)
colnames(trackall) <- c("Row.names","input","cutadapt","filtered", "denoisedF", "denoisedR", "merged", "nonchim","collapse",
                        "dada2","microbial","galan","decon","vf")
rownames(trackall) <- trackall$Row.names 
trackall$Row.names  <- NULL 
trackall <- trackall %>% mutate_if(is.character,as.numeric)
trackall[is.na(trackall)] <- 0
write.csv(trackall,file.path(outp.tab, "track.R1.csv"))

# Table with percentage of sequences lost 
track.tab <- data.frame(pipeline_step = c("cutadapt","DADA2 filter","No chimera", "Microbial", "Galan", "Decon","Vf"), 
  nb_reads = c((sum(trackall$input) - sum(trackall$cutadapt) ) / sum(trackall$input),
               (sum(trackall$cutadapt) - sum(trackall$filtered)) / sum(trackall$input), 
               (sum(trackall$filtered) - sum(trackall$nonchim) ) / sum(trackall$input),
               (sum(trackall$nonchim) - sum(trackall$microbial) ) / sum(trackall$input),
               (sum(trackall$microbial) - sum(trackall$galan) ) / sum(trackall$input),
               (sum(trackall$galan) - sum(trackall$decon) ) / sum(trackall$input),
               (sum(trackall$decon) - sum(trackall$vf) ) / sum(trackall$input)))
track.tab$percentage <- round(track.tab$nb_reads*100, 2)
sum(track.tab$percentage)
R1sum <- track.tab[,c("pipeline_step","percentage")]
write.csv(R1sum,file.path(outp.tab, "track.R1.percentage.csv"))  

# Plot with percentage of sequences lost 
trackall$smp_names <- rownames(trackall) 
tracktoplot <- trackall[,c("smp_names","input","cutadapt","filtered", "nonchim","microbial","galan","decon","vf")]
mlt.track <- melt(tracktoplot, id.vars = "smp_names")

# Format labels on the y axis
scientific_10 <- function(x) {
  parse(text=gsub("e", " %*% 10^", scales::scientific_format()(x)))
}

(track.seq.plot <- ggplot(mlt.track, aes(x = variable, y = value, color = variable, fill = variable)) + geom_violin(trim = FALSE) + 
  geom_line(aes(group = smp_names),colour = "grey", alpha = 0.3) + 
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=15),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) + 
  scale_color_manual(values  =  pal) +
  scale_fill_manual(values  =  pal) +
  scale_x_discrete(name = "Pipeline steps") +
  scale_y_continuous(name = "Number of reads", label=scientific_10) )
ggsave(file.path(outp.fig,"track.seq.R1.svg"),track.seq.plot, width = 7, height = 4, dpi = 300)

############################
#                          #
#      Data structure      #
#                          #
############################

############################
# Number of reads in sample
############################

# Create a dataframe with all information
(R1sum_sup <- data.frame(
  Variable = c("Minimum reads","Mean reads","Median reads","Maximum reads","Total reads","Number of Samples", "Number of ASVs"),
  All.samples = c(min(sample_sums(R1)),mean(sample_sums(R1)),
                  median(sample_sums(R1)),max(sample_sums(R1)),sum(sample_sums(R1)),nsamples(R1),ntaxa(R1) ) ))

#Plot the distribution of sample sequencing depth
sample_sum_df <- data.frame(sum = sample_sums(R1))
ggplot(sample_sum_df, aes(x = sum)) +
     geom_histogram(color = "black", fill = "indianred", binwidth = 1000) +
     # scale_x_continuous(breaks = c(50, 10000, 20000, 30000, 40000, 50000))  + 
     ggtitle("Distribution of sample sequencing depth") + 
     xlab("Read counts") + 
     theme(axis.title.y = element_blank())

#Top 100 samples with few sequences 
sort(sample_sums(R1))[1:100]

#Plot number of sequences in each sample  
plot(sort(sample_sums(R1)))

############################
# Number of samples
############################

#Cross tabulation 
smp <- as(sample_data(R1), "data.frame")
df_tab_cross<-as.data.frame(table(smp$Tree,smp$Position_branch,smp$Date))


# Plot cross tabulation 
Nb.sample <- ggplot(df_tab_cross, aes(x=Var1, y=Var2)) + geom_point(aes(size=Freq, color=Freq)) +
   geom_text(aes(label = Freq), fontface="bold") +
  scale_size_area(max_size = 10)+
  scale_color_gradient(low="red", high="#55c243")+
   facet_grid(.~Var3, scales = "free") +
   theme_bw() + xlab("") + ylab("") + 
   ggtitle("Number of samples in each tree")
ggsave(file.path(outp.fig,"Nb.sample.R1.svg"),Nb.sample, width = 5, height = 4, dpi = 300)

############################
# Number of taxa 
############################

# Table with percentage of ASVs lost during bioinformatic pipeline 
track.sp.R1 <- data.frame(pipeline_step = c("No chimera", "Microbial", "Galan", "Decon"), 
  nb_sp = c(ntaxa(dada2)-ntaxa(microbial),
            ntaxa(microbial)-ntaxa(galan),
            ntaxa(galan)-ntaxa(decon),
            ntaxa(decon)-ntaxa(R1)),
  percentage_sp = c((ntaxa(dada2)-ntaxa(microbial)) /ntaxa(dada2)*100,
            (ntaxa(microbial)-ntaxa(galan))/ntaxa(dada2)*100,
            (ntaxa(galan)-ntaxa(decon))/ntaxa(dada2)*100,
            (ntaxa(decon)-ntaxa(R1))/ntaxa(dada2)*100) ) 


# Plot number of taxa by rank  

# Script by Charlie Pauvert 
# Get a list of phyloseq objects, agglomerated to each taxonomic rank. Note, you should consider using or comparing to NArm = FALSE, especially if you have many taxa with unassigned lower ranks (e.g. Species). 
ps_list <- rank_names(R1) %>%
  map(~tax_glom(R1, ., NArm = TRUE))
names(ps_list) <- rank_names(R1)

# Get the richness for each sample at each rank. I'm just going to calculate this by hand.
richness <- ps_list %>%
    # convert otu_table to presence/absence
    map(transform_sample_counts, function(x) 1 * (x > 0)) %>%
    # Sample sum will now be the number of present taxa
    map(sample_sums)
richness
# Take the set of richness at each rank in the list richness and combine them into a data frame. Then join the sample data into this table.
tbl <- richness %>%
    map(enframe, "Sample", "Richness") %>%
    bind_rows(.id = "Rank")
sam <- sample_data(R1) %>% as_tibble(rownames = "Sample")
tbl <- left_join(tbl, sam, by = "Sample")
# If we are plotting ranks on the x-axis, we should make Rank a factor so we can order appropriately
tbl <- tbl %>%
    mutate(Rank = factor(Rank, rank_names(R1)))

(taxo.RS <- ggplot(tbl, aes(Rank, Richness)) +
    # ggbeeswarm::geom_quasirandom(color = "#39ab33") +    
  geom_boxplot() +
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=15),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) +
  xlab("Taxonomic rank") + ylab("ASV richness"))
ggsave(file.path(outp.fig,"taxo.RS.R1.svg"),taxo.RS, width = 5, height = 4, dpi = 300)
```

# Run 2 16S 
```{r run2, eval=FALSE}
############################
#                          #
#      Track sequences     #
#                          #
############################

############################
# Import data 
############################

# Import all data from each step of the pipeline 
raw <- countFastq("MicroMic/R_code/Pipelines/raw_data/demultiplexed_reads_16S_Run2", pattern = "fwd.fastq")
rownames(raw) <- gsub("_fwd.fastq","",rownames(raw)) 
raw$smp_names <- rownames(raw) 
out <- read.csv("MicroMic/R_code/Pipelines/output/Run2_16S/cutadapt_and_filtered/quality.csv", row.names = 1)
rownames(out) <- gsub("_rev.fastq","",rownames(out)) 
# Merge datasets
tabcounts <- merge(raw[,c("records","smp_names")], out, by = 0, all = T) 
rownames(tabcounts) <- tabcounts$smp_names 
tabcounts$Row.names <- NULL
tabcounts$smp_names <- NULL
colnames(tabcounts) <-  c("raw","cutadapt","filtered") 
  
pathR2 <- "MicroMic/R_code/Pipelines/output/Run2_16S/output/"
dadaFs <-  readRDS(file.path(pathR2,"dadaFs.RDS"))
dadaRs <- readRDS(file.path(pathR2,"dadaRs.RDS"))
mergers  <- readRDS(file.path(pathR2,"mergers.RDS"))
seqtab.nochim  <- readRDS(file.path(pathR2,"seqtab.no.chimera.RDS"))
collapse <- readRDS(file.path(pathR2,"seqtab.coll.RDS"))
dada2 <-  readRDS(file.path(pathR2,"ps.dada2.RDS"))
microbial <-  readRDS(file.path(pathR2,"ps.microbial.RDS"))
galan <- readRDS(file.path(pathR2,"ps.galan.RDS"))
decon <- readRDS(file.path(pathR2,"ps.decon.RDS"))
R2 <- readRDS(file.path(pathR2,"ps.vf.Run2.16S.RDS"))
 
############################
# Percentage of reads lost 
############################

# Get number of reads from RDS objects throughout the pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim), rowSums(collapse))
track <- merge(tabcounts,track,  by = 0, all = T)

trackps <- Reduce(function(x,y) merge(x = x, y = y, by ="names", all = T), 
       list(cbind(names = sample_names(dada2),dada2 = sample_sums(dada2)), 
            cbind(names = sample_names(microbial),microbial = sample_sums(microbial)), 
            cbind(names = sample_names(galan),galan = sample_sums(galan)), 
            cbind(names = sample_names(decon),decon = sample_sums(decon)),  
            cbind(names = sample_names(R2),vf = sample_sums(R2))))
trackall <- merge(track, trackps, by= 1)
colnames(trackall) <- c("Row.names","input","cutadapt","filtered", "denoisedF", "denoisedR", "merged", "nonchim","collapse",
                        "dada2","microbial","galan","decon","vf")
rownames(trackall) <- trackall$Row.names 
trackall$Row.names  <- NULL 
trackall <- trackall %>% mutate_if(is.character,as.numeric)
trackall[is.na(trackall)] <- 0
write.csv(trackall,file.path(outp.tab, "track.R2.csv"))

# Table with percentage of sequences lost 
track.tab <- data.frame(pipeline_step = c("cutadapt","DADA2 filter","No chimera", "Microbial", "Galan", "Decon","Vf"), 
  nb_reads = c((sum(trackall$input) - sum(trackall$cutadapt) ) / sum(trackall$input),
               (sum(trackall$cutadapt) - sum(trackall$filtered)) / sum(trackall$input), 
               (sum(trackall$filtered) - sum(trackall$nonchim) ) / sum(trackall$input),
               (sum(trackall$nonchim) - sum(trackall$microbial) ) / sum(trackall$input),
               (sum(trackall$microbial) - sum(trackall$galan) ) / sum(trackall$input),
               (sum(trackall$galan) - sum(trackall$decon) ) / sum(trackall$input),
               (sum(trackall$decon) - sum(trackall$vf) ) / sum(trackall$input)))
track.tab$percentage <- round(track.tab$nb_reads*100, 2)
sum(track.tab$percentage)
R2sum <- track.tab[,c("pipeline_step","percentage")]
write.csv(R2sum,file.path(outp.tab, "track.R2.percentage.csv"))  

# Plot with percentage of sequences lost 
trackall$smp_names <- rownames(trackall) 
tracktoplot <- trackall[,c("smp_names","input","cutadapt","filtered", "nonchim","microbial","galan","decon","vf")]
mlt.track <- melt(tracktoplot, id.vars = "smp_names")

# Format labels on the y axis
scientific_10 <- function(x) {
  parse(text=gsub("e", " %*% 10^", scales::scientific_format()(x)))
}

(track.seq.plot <- ggplot(mlt.track, aes(x = variable, y = value, color = variable, fill = variable)) + geom_violin(trim = FALSE) + 
  geom_line(aes(group = smp_names),colour = "grey", alpha = 0.3) + 
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=15),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) + 
  scale_color_manual(values  =  pal) +
  scale_fill_manual(values  =  pal) +
  scale_x_discrete(name = "Pipeline steps") +
  scale_y_continuous(name = "Number of reads", label=scientific_10) )
ggsave(file.path(outp.fig,"track.seq.R2.svg"),track.seq.plot, width = 7, height = 4, dpi = 300)

############################
#                          #
#      Data structure      #
#                          #
############################

############################
# Number of reads in sample
############################

# Separate the different sets 
R2_set2 <- subset_samples_no_zero(R2, Sample_type =="Leaf_piece")
R2_set3 <- subset_samples_no_zero(R2, Sample_type =="Leaf_tissu")
R2_set4 <- subset_samples_no_zero(R2, Sample_type =="Litter")
R2_set5 <- subset_samples_no_zero(R2, Sample_type =="Source")

# Create a dataframe with all information
(R2sum_sup <- data.frame(
  Variable = c("Minimum reads","Mean reads","Median reads","Maximum reads","Total reads","Number of Samples", "Number of ASVs"),
  All.samples = c(min(sample_sums(R2)),mean(sample_sums(R2)),
                  median(sample_sums(R2)),max(sample_sums(R2)),sum(sample_sums(R2)),nsamples(R2),ntaxa(R2)),
  Set2 = c(min(sample_sums(R2_set2)),mean(sample_sums(R2_set2)),
                  median(sample_sums(R2_set2)),max(sample_sums(R2_set2)),sum(sample_sums(R2_set2)),nsamples(R2_set2),ntaxa(R2_set2)),
  Set3 = c(min(sample_sums(R2_set3)),mean(sample_sums(R2_set3)),
                  median(sample_sums(R2_set3)),max(sample_sums(R2_set3)),sum(sample_sums(R2_set3)),nsamples(R2_set3),ntaxa(R2_set3)),
  Set4 = c(min(sample_sums(R2_set4)),mean(sample_sums(R2_set4)),
                  median(sample_sums(R2_set4)),max(sample_sums(R2_set4)),sum(sample_sums(R2_set4)),nsamples(R2_set4),ntaxa(R2_set4)),
  Set5 = c(min(sample_sums(R2_set5)),mean(sample_sums(R2_set5)),
                  median(sample_sums(R2_set5)),max(sample_sums(R2_set5)),sum(sample_sums(R2_set5)),nsamples(R2_set5),ntaxa(R2_set5))))

#Plot the distribution of sample sequencing depth
sample_sum_df <- data.frame(sum = sample_sums(R2))
ggplot(sample_sum_df, aes(x = sum)) +
     geom_histogram(color = "black", fill = "indianred", binwidth = 1000) +
     # scale_x_continuous(breaks = c(50, 10000, 20000, 30000, 40000, 50000))  + 
     ggtitle("Distribution of sample sequencing depth") + 
     xlab("Read counts") + 
     theme(axis.title.y = element_blank())

#Top 100 samples with few sequences 
sort(sample_sums(R2))[1:100]

#Plot number of sequences in each sample  
plot(sort(sample_sums(R2)))

############################
# Number of samples
############################

#Cross tabulation 
smp <- as(sample_data(R2), "data.frame")

# Leaf tissue 
smp.ltissue <- subset(smp, Sample_type =="Leaf_tissu")
(df_tab_cross_ltissue <- as.data.frame(table(smp.ltissue$Tissue_type, smp.ltissue$Position)))
ggplot(df_tab_cross_ltissue, aes(x=Var2, y=Var1)) + geom_point(aes(size=Freq, color=Freq)) +
  geom_text(aes(label = Freq), fontface="bold") +
  scale_size_area(max_size = 10)+
  scale_color_gradient(low="red", high="#55c243")+
   # facet_grid(.~Var2+Var3, scales = "free") +
   theme_bw() + xlab("") + ylab("")

# Leaf piece 
smp.lpiece <- subset(smp, Sample_type =="Leaf_piece")
(df_tab_cross_lpiece <- as.data.frame(table(smp.lpiece$Tissue_type, smp.lpiece$Date ,smp.lpiece$Tree,smp.lpiece$Position )))
ggplot(df_tab_cross_lpiece, aes(x=Var4, y=Var1)) + geom_point(aes(size=Freq, color=Freq)) +
  geom_text(aes(label = Freq), fontface="bold") +
  scale_size_area(max_size = 10)+
  scale_color_gradient(low="red", high="#55c243")+
   facet_grid(.~Var2+Var3, scales = "free") +
   theme_bw() + xlab("") + ylab("") 

# Litter 
smp.litter <- subset(smp, Sample_type =="Litter")
(df_tab_cross_litter <- as.data.frame(table(smp.litter$Date, smp.litter$Tree, smp.litter$Position)))
ggplot(df_tab_cross_litter, aes(x=Var3, y=Var1)) + geom_point(aes(size=Freq, color=Freq)) +
  geom_text(aes(label = Freq), fontface="bold") +
  scale_size_area(max_size = 10)+
  scale_color_gradient(low="red", high="#55c243")+
   facet_grid(.~Var2, scales = "free") +
   theme_bw() + xlab("") + ylab("") 

# Sources 
smp.source <- subset(smp, Sample_type =="Source")
(df_tab_cross_source <-as.data.frame(table(smp.source$Tissue_type)))

Nb.sample <- ggplot(df_tab_cross, aes(x=Var1, y=Var2)) + geom_point(aes(size=Freq, color=Freq)) +
   geom_text(aes(label = Freq), fontface="bold") +
  scale_size_area(max_size = 10)+
  scale_color_gradient(low="red", high="#55c243")+
   facet_grid(.~Var3, scales = "free") +
   theme_bw() + xlab("") + ylab("") + 
   ggtitle("Number of samples in each tree")
ggsave(file.path(outp.fig,"Nb.sample.R2.svg"),Nb.sample, width = 5, height = 4, dpi = 300)


############################
# Number of taxa 
############################

# Table with percentage of ASVs lost during bioinformatic pipeline 
track.sp.R1 <- data.frame(pipeline_step = c("No chimera", "Microbial", "Galan", "Decon"), 
  nb_sp = c(ntaxa(dada2)-ntaxa(microbial),
            ntaxa(microbial)-ntaxa(galan),
            ntaxa(galan)-ntaxa(decon),
            ntaxa(decon)-ntaxa(R2)),
  percentage_sp = c((ntaxa(dada2)-ntaxa(microbial)) /ntaxa(dada2)*100,
            (ntaxa(microbial)-ntaxa(galan))/ntaxa(dada2)*100,
            (ntaxa(galan)-ntaxa(decon))/ntaxa(dada2)*100,
            (ntaxa(decon)-ntaxa(R2))/ntaxa(dada2)*100) ) 


# Plot number of taxa by rank  

# Script by Charlie Pauvert 
# Get a list of phyloseq objects, agglomerated to each taxonomic rank. Note, you should consider using or comparing to NArm = FALSE, especially if you have many taxa with unassigned lower ranks (e.g. Species). 
ps_list <- rank_names(R2) %>%
  map(~tax_glom(R2, ., NArm = TRUE))
names(ps_list) <- rank_names(R2)

# Get the richness for each sample at each rank. I'm just going to calculate this by hand.
richness <- ps_list %>%
    # convert otu_table to presence/absence
    map(transform_sample_counts, function(x) 1 * (x > 0)) %>%
    # Sample sum will now be the number of present taxa
    map(sample_sums)
richness
# Take the set of richness at each rank in the list richness and combine them into a data frame. Then join the sample data into this table.
tbl <- richness %>%
    map(enframe, "Sample", "Richness") %>%
    bind_rows(.id = "Rank")
sam <- sample_data(R2) %>% as_tibble(rownames = "Sample")
tbl <- left_join(tbl, sam, by = "Sample")
# If we are plotting ranks on the x-axis, we should make Rank a factor so we can order appropriately
tbl <- tbl %>%
    mutate(Rank = factor(Rank, rank_names(R2)))

(taxo.RS <- ggplot(tbl, aes(Rank, Richness)) +
    # ggbeeswarm::geom_quasirandom(color = "#39ab33") +    
  geom_boxplot() +
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=15),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) +
  xlab("Taxonomic rank") + ylab("ASV richness"))
ggsave(file.path(outp.fig,"taxo.RS.R2.svg"),taxo.RS, width = 5, height = 4, dpi = 300)
```

# Run 3 ITS 
```{r run3, eval=FALSE}
############################
#                          #
#      Track sequences     #
#                          #
############################

############################
# Import data 
############################

# Import all data from each step of the pipeline 
raw <- countFastq("MicroMic/R_code/Pipelines/raw_data/demultiplexed_reads_ITS_Run3", pattern = "fwd.fastq")
rownames(raw) <- gsub("_fwd.fastq","",rownames(raw)) 
raw$smp_names <- rownames(raw) 
out <- read.csv("MicroMic/R_code/Pipelines/output/Run3_ITS1/cutadapt_and_filtered/quality.csv", row.names = 1)
rownames(out) <- gsub("_rev.fastq","",rownames(out)) 
# Merge datasets
tabcounts <- merge(raw[,c("records","smp_names")], out, by = 0, all = T) 
rownames(tabcounts) <- tabcounts$smp_names 
tabcounts$Row.names <- NULL
tabcounts$smp_names <- NULL
colnames(tabcounts) <-  c("raw","cutadapt","filtered") 
  
pathR3 <- "MicroMic/R_code/Pipelines/output/Run3_ITS1/output/"
dadaFs <-  readRDS(file.path(pathR3,"dadaFs.RDS"))
dadaRs <- readRDS(file.path(pathR3,"dadaRs.RDS"))
mergers  <- readRDS(file.path(pathR3,"mergers.RDS"))
seqtab.nochim  <- readRDS(file.path(pathR3,"seqtab.no.chimera.RDS"))
collapse <- readRDS(file.path(pathR3,"seqtab.coll.RDS"))
dada2 <-  readRDS(file.path(pathR3,"ps.dada2.RDS"))
microbial <-  readRDS(file.path(pathR3,"ps.microbial.RDS"))
galan <- readRDS(file.path(pathR3,"ps.galan.RDS"))
decon <- readRDS(file.path(pathR3,"ps.decon.RDS"))
R3 <- readRDS(file.path(pathR3,"ps.vf.Run3.ITS.RDS"))

############################
# Percentage of reads lost 
############################

# Get number of reads from RDS objects throughout the pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim), rowSums(collapse))
track <- merge(tabcounts,track,  by = 0, all = T)

trackps <- Reduce(function(x,y) merge(x = x, y = y, by ="names", all = T), 
       list(cbind(names = sample_names(dada2),dada2 = sample_sums(dada2)), 
            cbind(names = sample_names(microbial),microbial = sample_sums(microbial)), 
            cbind(names = sample_names(galan),galan = sample_sums(galan)), 
            cbind(names = sample_names(decon),decon = sample_sums(decon)),  
            cbind(names = sample_names(R3),vf = sample_sums(R3))))
trackall <- merge(track, trackps, by= 1)
colnames(trackall) <- c("Row.names","input","cutadapt","filtered", "denoisedF", "denoisedR", "merged", "nonchim","collapse",
                        "dada2","microbial","galan","decon","vf")
rownames(trackall) <- trackall$Row.names 
trackall$Row.names  <- NULL 
trackall <- trackall %>% mutate_if(is.character,as.numeric)
trackall[is.na(trackall)] <- 0
write.csv(trackall,file.path(outp.tab, "track.R3.csv"))

# Table with percentage of sequences lost 
track.tab <- data.frame(pipeline_step = c("cutadapt","DADA2 filter","No chimera", "Microbial", "Galan", "Decon","Vf"), 
  nb_reads = c((sum(trackall$input) - sum(trackall$cutadapt) ) / sum(trackall$input),
               (sum(trackall$cutadapt) - sum(trackall$filtered)) / sum(trackall$input), 
               (sum(trackall$filtered) - sum(trackall$nonchim) ) / sum(trackall$input),
               (sum(trackall$nonchim) - sum(trackall$microbial) ) / sum(trackall$input),
               (sum(trackall$microbial) - sum(trackall$galan) ) / sum(trackall$input),
               (sum(trackall$galan) - sum(trackall$decon) ) / sum(trackall$input),
               (sum(trackall$decon) - sum(trackall$vf) ) / sum(trackall$input)))
track.tab$percentage <- round(track.tab$nb_reads*100, 2)
sum(track.tab$percentage)
R3sum <- track.tab[,c("pipeline_step","percentage")]
write.csv(R3sum,file.path(outp.tab, "track.R3.percentage.csv"))  

# Plot with percentage of sequences lost 
trackall$smp_names <- rownames(trackall) 
tracktoplot <- trackall[,c("smp_names","input","cutadapt","filtered", "nonchim","microbial","galan","decon","vf")]
mlt.track <- melt(tracktoplot, id.vars = "smp_names")

# Format labels on the y axis
scientific_10 <- function(x) {
  parse(text=gsub("e", " %*% 10^", scales::scientific_format()(x)))
}

(track.seq.plot <- ggplot(mlt.track, aes(x = variable, y = value, color = variable, fill = variable)) + geom_violin(trim = FALSE) + 
  geom_line(aes(group = smp_names),colour = "grey", alpha = 0.3) + 
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=15),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) + 
  scale_color_manual(values  =  pal) +
  scale_fill_manual(values  =  pal) +
  scale_x_discrete(name = "Pipeline steps") +
  scale_y_continuous(name = "Number of reads", label=scientific_10) )
ggsave(file.path(outp.fig,"track.seq.R3.svg"),track.seq.plot, width = 7, height = 4, dpi = 300)

############################
#                          #
#      Data structure      #
#                          #
############################

############################
# Number of reads in sample
############################

# Create a dataframe with all information
(R3sum_sup <- data.frame(
  Variable = c("Minimum reads","Mean reads","Median reads","Maximum reads","Total reads","Number of Samples", "Number of ASVs"),
  All.samples = c(min(sample_sums(R3)),mean(sample_sums(R3)),
                  median(sample_sums(R3)),max(sample_sums(R3)),sum(sample_sums(R3)),nsamples(R3),ntaxa(R3) ) ))

#Plot the distribution of sample sequencing depth
sample_sum_df <- data.frame(sum = sample_sums(R3))
ggplot(sample_sum_df, aes(x = sum)) +
     geom_histogram(color = "black", fill = "indianred", binwidth = 1000) +
     # scale_x_continuous(breaks = c(50, 10000, 20000, 30000, 40000, 50000))  + 
     ggtitle("Distribution of sample sequencing depth") + 
     xlab("Read counts") + 
     theme(axis.title.y = element_blank())

#Top 100 samples with few sequences 
sort(sample_sums(R3))[1:100]

#Plot number of sequences in each sample  
plot(sort(sample_sums(R3)))

############################
# Number of samples
############################

#Cross tabulation 
smp <- as(sample_data(R3), "data.frame")
df_tab_cross<-as.data.frame(table(smp$Tree,smp$Position_branch,smp$Date))

# Plot cross tabulation 
Nb.sample <- ggplot(df_tab_cross, aes(x=Var1, y=Var2)) + geom_point(aes(size=Freq, color=Freq)) +
   geom_text(aes(label = Freq), fontface="bold") +
  scale_size_area(max_size = 10)+
  scale_color_gradient(low="red", high="#55c243")+
   facet_grid(.~Var3, scales = "free") +
   theme_bw() + xlab("") + ylab("") + 
   ggtitle("Number of samples in each tree")
ggsave(file.path(outp.fig,"Nb.sample.R3.svg"),Nb.sample, width = 5, height = 4, dpi = 300)

############################
# Number of taxa 
############################

# Table with percentage of ASVs lost during bioinformatic pipeline 
track.sp.R3 <- data.frame(pipeline_step = c("No chimera", "Microbial", "Galan", "Decon"), 
  nb_sp = c(ntaxa(dada2)-ntaxa(microbial),
            ntaxa(microbial)-ntaxa(galan),
            ntaxa(galan)-ntaxa(decon),
            ntaxa(decon)-ntaxa(R3)),
  percentage_sp = c((ntaxa(dada2)-ntaxa(microbial)) /ntaxa(dada2)*100,
            (ntaxa(microbial)-ntaxa(galan))/ntaxa(dada2)*100,
            (ntaxa(galan)-ntaxa(decon))/ntaxa(dada2)*100,
            (ntaxa(decon)-ntaxa(R3))/ntaxa(dada2)*100) ) 


# Plot number of taxa by rank  

# Script by Charlie Pauvert 
# Get a list of phyloseq objects, agglomerated to each taxonomic rank. Note, you should consider using or comparing to NArm = FALSE, especially if you have many taxa with unassigned lower ranks (e.g. Species). 
ps_list <- rank_names(R3) %>%
  map(~tax_glom(R3, ., NArm = TRUE))
names(ps_list) <- rank_names(R3)

# Get the richness for each sample at each rank. I'm just going to calculate this by hand.
richness <- ps_list %>%
    # convert otu_table to presence/absence
    map(transform_sample_counts, function(x) 1 * (x > 0)) %>%
    # Sample sum will now be the number of present taxa
    map(sample_sums)
richness
# Take the set of richness at each rank in the list richness and combine them into a data frame. Then join the sample data into this table.
tbl <- richness %>%
    map(enframe, "Sample", "Richness") %>%
    bind_rows(.id = "Rank")
sam <- sample_data(R3) %>% as_tibble(rownames = "Sample")
tbl <- left_join(tbl, sam, by = "Sample")
# If we are plotting ranks on the x-axis, we should make Rank a factor so we can order appropriately
tbl <- tbl %>%
    mutate(Rank = factor(Rank, rank_names(R3)))

(taxo.RS <- ggplot(tbl, aes(Rank, Richness)) +
    # ggbeeswarm::geom_quasirandom(color = "#39ab33") +    
  geom_boxplot() +
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=15),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) +
  xlab("Taxonomic rank") + ylab("ASV richness"))
ggsave(file.path(outp.fig,"taxo.RS.R3.svg"),taxo.RS, width = 5, height = 4, dpi = 300)
```

# Run 4 ITS 
```{r run4, eval=FALSE}
############################
#                          #
#      Track sequences     #
#                          #
############################

############################
# Import data 
############################

# Import all data from each step of the pipeline 
raw <- countFastq("MicroMic/R_code/Pipelines/raw_data/demultiplexed_reads_ITS_Run4", pattern = "fwd.fastq")
rownames(raw) <- gsub("_fwd.fastq","",rownames(raw)) 
raw$smp_names <- rownames(raw) 
out <- read.csv("MicroMic/R_code/Pipelines/output/Run4_ITS1/cutadapt_and_filtered/quality.csv", row.names = 1)
rownames(out) <- gsub("_rev.fastq","",rownames(out)) 
# Merge datasets
tabcounts <- merge(raw[,c("records","smp_names")], out, by = 0, all = T) 
rownames(tabcounts) <- tabcounts$smp_names 
tabcounts$Row.names <- NULL
tabcounts$smp_names <- NULL
colnames(tabcounts) <-  c("raw","cutadapt","filtered") 
  
pathR4 <- "MicroMic/R_code/Pipelines/output/Run4_ITS1/output/"
dadaFs <-  readRDS(file.path(pathR4,"dadaFs.RDS"))
dadaRs <- readRDS(file.path(pathR4,"dadaRs.RDS"))
mergers  <- readRDS(file.path(pathR4,"mergers.RDS"))
seqtab.nochim  <- readRDS(file.path(pathR4,"seqtab.no.chimera.RDS"))
collapse <- readRDS(file.path(pathR4,"seqtab.coll.RDS"))
dada2 <-  readRDS(file.path(pathR4,"ps.dada2.RDS"))
microbial <-  readRDS(file.path(pathR4,"ps.microbial.RDS"))
galan <- readRDS(file.path(pathR4,"ps.galan.RDS"))
decon <- readRDS(file.path(pathR4,"ps.decon.RDS"))
R4 <- readRDS(file.path(pathR4,"ps.vf.Run4.ITS.RDS"))
 
############################
# Percentage of reads lost 
############################

# Get number of reads from RDS objects throughout the pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim), rowSums(collapse))
track <- merge(tabcounts,track,  by = 0, all = T)

trackps <- Reduce(function(x,y) merge(x = x, y = y, by ="names", all = T), 
       list(cbind(names = sample_names(dada2),dada2 = sample_sums(dada2)), 
            cbind(names = sample_names(microbial),microbial = sample_sums(microbial)), 
            cbind(names = sample_names(galan),galan = sample_sums(galan)), 
            cbind(names = sample_names(decon),decon = sample_sums(decon)),  
            cbind(names = sample_names(R4),vf = sample_sums(R4))))
trackall <- merge(track, trackps, by= 1)
colnames(trackall) <- c("Row.names","input","cutadapt","filtered", "denoisedF", "denoisedR", "merged", "nonchim","collapse",
                        "dada2","microbial","galan","decon","vf")
rownames(trackall) <- trackall$Row.names 
trackall$Row.names  <- NULL 
trackall <- trackall %>% mutate_if(is.character,as.numeric)
trackall[is.na(trackall)] <- 0
write.csv(trackall,file.path(outp.tab, "track.R4.csv"))

# Table with percentage of sequences lost 
track.tab <- data.frame(pipeline_step = c("cutadapt","DADA2 filter","No chimera", "Microbial", "Galan", "Decon","Vf"), 
  nb_reads = c((sum(trackall$input) - sum(trackall$cutadapt) ) / sum(trackall$input),
               (sum(trackall$cutadapt) - sum(trackall$filtered)) / sum(trackall$input), 
               (sum(trackall$filtered) - sum(trackall$nonchim) ) / sum(trackall$input),
               (sum(trackall$nonchim) - sum(trackall$microbial) ) / sum(trackall$input),
               (sum(trackall$microbial) - sum(trackall$galan) ) / sum(trackall$input),
               (sum(trackall$galan) - sum(trackall$decon) ) / sum(trackall$input),
               (sum(trackall$decon) - sum(trackall$vf) ) / sum(trackall$input)))
track.tab$percentage <- round(track.tab$nb_reads*100, 2)
sum(track.tab$percentage)
R4sum <- track.tab[,c("pipeline_step","percentage")]
write.csv(R4sum,file.path(outp.tab, "track.R4.percentage.csv"))  

# Plot with percentage of sequences lost 
trackall$smp_names <- rownames(trackall) 
tracktoplot <- trackall[,c("smp_names","input","cutadapt","filtered", "nonchim","microbial","galan","decon","vf")]
mlt.track <- melt(tracktoplot, id.vars = "smp_names")

# Format labels on the y axis
scientific_10 <- function(x) {
  parse(text=gsub("e", " %*% 10^", scales::scientific_format()(x)))
}

(track.seq.plot <- ggplot(mlt.track, aes(x = variable, y = value, color = variable, fill = variable)) + geom_violin(trim = FALSE) + 
  geom_line(aes(group = smp_names),colour = "grey", alpha = 0.3) + 
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=15),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) + 
  scale_color_manual(values  =  pal) +
  scale_fill_manual(values  =  pal) +
  scale_x_discrete(name = "Pipeline steps") +
  scale_y_continuous(name = "Number of reads", label=scientific_10) )
ggsave(file.path(outp.fig,"track.seq.R4.svg"),track.seq.plot, width = 7, height = 4, dpi = 300)

############################
#                          #
#      Data structure      #
#                          #
############################

############################
# Number of reads in sample
############################

# Separate the different sets 
R4_set2 <- subset_samples_no_zero(R4, Sample_type =="Leaf_piece")
R4_set3 <- subset_samples_no_zero(R4, Sample_type =="Leaf_tissu")
R4_set4 <- subset_samples_no_zero(R4, Sample_type =="Litter")
R4_set5 <- subset_samples_no_zero(R4, Sample_type =="Source")

# Create a dataframe with all information
(R4sum_sup <- data.frame(
  Variable = c("Minimum reads","Mean reads","Median reads","Maximum reads","Total reads","Number of Samples", "Number of ASVs"),
  All.samples = c(min(sample_sums(R4)),mean(sample_sums(R4)),
                  median(sample_sums(R4)),max(sample_sums(R4)),sum(sample_sums(R4)),nsamples(R4),ntaxa(R4)),
  Set2 = c(min(sample_sums(R4_set2)),mean(sample_sums(R4_set2)),
                  median(sample_sums(R4_set2)),max(sample_sums(R4_set2)),sum(sample_sums(R4_set2)),nsamples(R4_set2),ntaxa(R4_set2)),
  Set3 = c(min(sample_sums(R4_set3)),mean(sample_sums(R4_set3)),
                  median(sample_sums(R4_set3)),max(sample_sums(R4_set3)),sum(sample_sums(R4_set3)),nsamples(R4_set3),ntaxa(R4_set3)),
  Set4 = c(min(sample_sums(R4_set4)),mean(sample_sums(R4_set4)),
                  median(sample_sums(R4_set4)),max(sample_sums(R4_set4)),sum(sample_sums(R4_set4)),nsamples(R4_set4),ntaxa(R4_set4)),
  Set5 = c(min(sample_sums(R4_set5)),mean(sample_sums(R4_set5)),
                  median(sample_sums(R4_set5)),max(sample_sums(R4_set5)),sum(sample_sums(R4_set5)),nsamples(R4_set5),ntaxa(R4_set5))))

#Plot the distribution of sample sequencing depth
sample_sum_df <- data.frame(sum = sample_sums(R4))
ggplot(sample_sum_df, aes(x = sum)) +
     geom_histogram(color = "black", fill = "indianred", binwidth = 1000) +
     # scale_x_continuous(breaks = c(50, 10000, 20000, 30000, 40000, 50000))  + 
     ggtitle("Distribution of sample sequencing depth") + 
     xlab("Read counts") + 
     theme(axis.title.y = element_blank())

#Top 100 samples with few sequences 
sort(sample_sums(R4))[1:100]

#Plot number of sequences in each sample  
plot(sort(sample_sums(R4)))

############################
# Number of samples
############################

#Cross tabulation 
smp <- as(sample_data(R4), "data.frame")

# Leaf tissue 
smp.ltissue <- subset(smp, Sample_type =="Leaf_tissu")
(df_tab_cross_ltissue <- as.data.frame(table(smp.ltissue$Tissue_type, smp.ltissue$Position)))
ggplot(df_tab_cross_ltissue, aes(x=Var2, y=Var1)) + geom_point(aes(size=Freq, color=Freq)) +
  geom_text(aes(label = Freq), fontface="bold") +
  scale_size_area(max_size = 10)+
  scale_color_gradient(low="red", high="#55c243")+
   # facet_grid(.~Var2+Var3, scales = "free") +
   theme_bw() + xlab("") + ylab("")

# Leaf piece 
smp.lpiece <- subset(smp, Sample_type =="Leaf_piece")
(df_tab_cross_lpiece <- as.data.frame(table(smp.lpiece$Tissue_type, smp.lpiece$Date ,smp.lpiece$Tree,smp.lpiece$Position )))
ggplot(df_tab_cross_lpiece, aes(x=Var4, y=Var1)) + geom_point(aes(size=Freq, color=Freq)) +
  geom_text(aes(label = Freq), fontface="bold") +
  scale_size_area(max_size = 10)+
  scale_color_gradient(low="red", high="#55c243")+
   facet_grid(.~Var2+Var3, scales = "free") +
   theme_bw() + xlab("") + ylab("") 

# Litter 
smp.litter <- subset(smp, Sample_type =="Litter")
(df_tab_cross_litter <- as.data.frame(table(smp.litter$Date, smp.litter$Tree, smp.litter$Position)))
ggplot(df_tab_cross_litter, aes(x=Var3, y=Var1)) + geom_point(aes(size=Freq, color=Freq)) +
  geom_text(aes(label = Freq), fontface="bold") +
  scale_size_area(max_size = 10)+
  scale_color_gradient(low="red", high="#55c243")+
   facet_grid(.~Var2, scales = "free") +
   theme_bw() + xlab("") + ylab("") 

# Sources 
smp.source <- subset(smp, Sample_type =="Source")
(df_tab_cross_source <-as.data.frame(table(smp.source$Tissue_type)))

Nb.sample <- ggplot(df_tab_cross, aes(x=Var1, y=Var2)) + geom_point(aes(size=Freq, color=Freq)) +
   geom_text(aes(label = Freq), fontface="bold") +
  scale_size_area(max_size = 10)+
  scale_color_gradient(low="red", high="#55c243")+
   facet_grid(.~Var3, scales = "free") +
   theme_bw() + xlab("") + ylab("") + 
   ggtitle("Number of samples in each tree")
ggsave(file.path(outp.fig,"Nb.sample.R4.svg"),Nb.sample, width = 5, height = 4, dpi = 300)


############################
# Number of taxa 
############################

# Table with percentage of ASVs lost during bioinformatic pipeline 
track.sp.R4 <- data.frame(pipeline_step = c("No chimera", "Microbial", "Galan", "Decon"), 
  nb_sp = c(ntaxa(dada2)-ntaxa(microbial),
            ntaxa(microbial)-ntaxa(galan),
            ntaxa(galan)-ntaxa(decon),
            ntaxa(decon)-ntaxa(R4)),
  percentage_sp = c((ntaxa(dada2)-ntaxa(microbial)) /ntaxa(dada2)*100,
            (ntaxa(microbial)-ntaxa(galan))/ntaxa(dada2)*100,
            (ntaxa(galan)-ntaxa(decon))/ntaxa(dada2)*100,
            (ntaxa(decon)-ntaxa(R4))/ntaxa(dada2)*100) ) 


# Plot number of taxa by rank  

# Script by Charlie Pauvert 
# Get a list of phyloseq objects, agglomerated to each taxonomic rank. Note, you should consider using or comparing to NArm = FALSE, especially if you have many taxa with unassigned lower ranks (e.g. Species). 
ps_list <- rank_names(R4) %>%
  map(~tax_glom(R4, ., NArm = TRUE))
names(ps_list) <- rank_names(R4)

# Get the richness for each sample at each rank. I'm just going to calculate this by hand.
richness <- ps_list %>%
    # convert otu_table to presence/absence
    map(transform_sample_counts, function(x) 1 * (x > 0)) %>%
    # Sample sum will now be the number of present taxa
    map(sample_sums)
richness
# Take the set of richness at each rank in the list richness and combine them into a data frame. Then join the sample data into this table.
tbl <- richness %>%
    map(enframe, "Sample", "Richness") %>%
    bind_rows(.id = "Rank")
sam <- sample_data(R4) %>% as_tibble(rownames = "Sample")
tbl <- left_join(tbl, sam, by = "Sample")
# If we are plotting ranks on the x-axis, we should make Rank a factor so we can order appropriately
tbl <- tbl %>%
    mutate(Rank = factor(Rank, rank_names(R4)))

(taxo.RS <- ggplot(tbl, aes(Rank, Richness)) +
    # ggbeeswarm::geom_quasirandom(color = "#39ab33") +    
  geom_boxplot() +
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=15),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) +
  xlab("Taxonomic rank") + ylab("ASV richness"))
ggsave(file.path(outp.fig,"taxo.RS.R4.svg"),taxo.RS, width = 5, height = 4, dpi = 300)
```


Statistical analysis: Microbial composition and richness  
# Data structure 
```{r data_structure}
# Track the percentage of reads lost for each Run
read_lost <- cbind(R1sum, R2sum[,2], R3sum[,2], R4sum[,2])
total_read_lost <- c("Total",sum(R1sum$percentage),sum(R2sum$percentage),sum(R3sum$percentage),sum(R4sum$percentage))
read_lost <- rbind(read_lost, total_read_lost)
colnames(read_lost) <- c("Pipeline step","Run1","Run2","Run3","Run4")
write.csv(read_lost,file.path(outp.tab, "track.all.percentage.csv"))  

nb_reads <- cbind(R1sum_sup, R2sum_sup[,-1], R3sum_sup, R4sum_sup[,-1])
nb_reads <- as.data.frame(t(nb_reads), stringsAsFactors = FALSE)
nb_reads <- nb_reads[c(-1,-8),]
nb_reads[c("V1","V2","V3","V4","V5","V6","V7")] <- sapply(nb_reads[c("V1","V2","V3","V4","V5","V6","V7")] ,as.numeric)
nb_reads$Run <- c("Run1",rep("Run2",5),"Run3",rep("Run4",5))
nb_reads$Sets <- c(rep(c("Set#1","All sets","Set#2","Set#3","Set#4","Set#5")))
colnames(nb_reads) <- c("Minimum reads","Mean reads","Median reads","Maximum reads","Total reads","Number of Samples","Number of ASVs",
                        "Run","Set")
write.csv(nb_reads,file.path(outp.tab, "nb_reads.csv")) 
```

# Taxonomic composition 
```{r taxonomic_composition}

#---------------------------# 
# Collapse ASV between runs #
#---------------------------#
rename_phyloseq_from_FASTA <-function(physeq, corresponding, differing_length = FALSE,...){
  corresp<-corresponding[,2]
  names(corresp)<-corresponding[,1]
  # first column : the initial OTU name
  # second column : the actual OTU name
  # Check that all current names in phyloseq object
  #  matches the name in the file
  if( ! all(taxa_names(physeq) %in% names(corresp) ) ){
    stop("Not all taxa names in physeq object matches the names in the corresponding file.")
  }
  # Check the number of items
  if( length(corresponding) != ntaxa(physeq) & differing_length ){
    stop("Differing length between the corresponding file and the number of taxa.\n If you know what you are doing, use differing_length = TRUE")
  }
  taxa_names(physeq)<-unname(corresp[ taxa_names(physeq) ])
  return(physeq)
}


#        Bacteria        #
#------------------------#

# Merge with taxonomy table from the phyloseq object 
library(Biostrings)

# Rename ASV names to sequences -Run 1 
Seq.R1 <- readDNAStringSet("MicroMic/R_code/Pipelines/output/Run1_16S/output/ASV.seq.16S1.fasta")
ASVnames1 = names(Seq.R1)
Sequences1 = paste(Seq.R1)
dfseqR1 <- data.frame(ASVnames1, Sequences1)
R1 <- readRDS("MicroMic/R_code/Pipelines/output/Run1_16S/output/ps.vf.Run1.16S.RDS")
dfseqR1 <-dfseqR1[(dfseqR1$ASVnames1 %in% rownames(as.matrix(tax_table(R1)))),]
ps.run1seq <- R1
ps.run1seq <- rename_phyloseq_from_FASTA(ps.run1seq, dfseqR1)

# Rename ASV names to sequences -Run 2 
Seq.R2 <- readDNAStringSet("MicroMic/R_code/Pipelines/output/Run2_16S/output/ASV.seq.16S2.fasta")
ASVnames2 = names(Seq.R2)
Sequences2 = paste(Seq.R2)
dfseqR2 <- data.frame(ASVnames2, Sequences2)
R2 <- readRDS("MicroMic/R_code/Pipelines/output/Run2_16S/output/ps.vf.Run2.16S.RDS")
dfseqR2 <-dfseqR2[(dfseqR2$ASVnames2 %in% rownames(as.matrix(tax_table(R2)))),]
ps.run2seq <- R2
ps.run2seq <- rename_phyloseq_from_FASTA(ps.run2seq, dfseqR2)

psRun1and2 <- merge_phyloseq(ps.run1seq, ps.run2seq)
sample_data(psRun1and2)[,"Sample_type"][is.na(sample_data(psRun1and2)[,"Sample_type"])] <- "Leaf"
sample_data(psRun1and2)$Sample_type <- as.factor(sample_data(psRun1and2)$Sample_type)
sample_data(psRun1and2)[,"Tissue_type"][is.na(sample_data(psRun1and2)[,"Tissue_type"])] <- "Leaf"
sample_data(psRun1and2)$Tissue_type <- as.factor(sample_data(psRun1and2)$Tissue_type)


#        Fungi           #
#------------------------#

# Merge with taxonomy table from the phyloseq object 


# Rename ASV names to sequences -Run 1 
Seq.R3 <- readDNAStringSet("MicroMic/R_code/Pipelines/output/Run3_ITS1/output/ASV.seq.ITS3.fasta")
ASVnames3 = names(Seq.R3)
Sequences3 = paste(Seq.R3)
dfseqR3 <- data.frame(ASVnames3, Sequences3)
R3 <- readRDS("MicroMic/R_code/Pipelines/output/Run3_ITS1/output/ps.vf.Run3.ITS.RDS")
dfseqR3 <-dfseqR3[(dfseqR3$ASVnames3 %in% rownames(as.matrix(tax_table(R3)))),]
ps.run3seq <- R3
ps.run3seq <- rename_phyloseq_from_FASTA(ps.run3seq, dfseqR3)

# Rename ASV names to sequences -Run 4 
Seq.R4 <- readDNAStringSet("MicroMic/R_code/Pipelines/output/Run4_ITS1/output/ASVID.ITS4.seq.fasta")
ASVnames4 = names(Seq.R4)
Sequences4 = paste(Seq.R4)
dfseqR4 <- data.frame(ASVnames4, Sequences4)
R4 <- readRDS("MicroMic/R_code/Pipelines/output/Run4_ITS1/output/ps.vf.Run4.ITS.RDS")
dfseqR4 <-dfseqR4[(dfseqR4$ASVnames4 %in% rownames(as.matrix(tax_table(R4)))),]
ps.run4seq <- R4
ps.run4seq <- rename_phyloseq_from_FASTA(ps.run4seq, dfseqR4)

psRun3and4 <- merge_phyloseq(ps.run3seq, ps.run4seq)
sample_data(psRun3and4)[,"Sample_type"][is.na(sample_data(psRun3and4)[,"Sample_type"])] <- "Leaf"
sample_data(psRun3and4)$Sample_type <- as.factor(sample_data(psRun3and4)$Sample_type)
sample_data(psRun3and4)[,"Tissue_type"][is.na(sample_data(psRun3and4)[,"Tissue_type"])] <- "Leaf"
sample_data(psRun3and4)$Tissue_type <- as.factor(sample_data(psRun3and4)$Tissue_type)

sum_all <- data.frame(
  Variable = c("Minimum reads","Mean reads","Median reads","Maximum reads","Total reads","Number of Samples", "Number of ASVs"),
  Run1_2 = c(min(sample_sums(psRun1and2)),mean(sample_sums(psRun1and2)),
                  median(sample_sums(psRun1and2)),max(sample_sums(psRun1and2)),sum(sample_sums(psRun1and2)),
                  nsamples(psRun1and2),ntaxa(psRun1and2)),
  Run3_4 = c(min(sample_sums(psRun3and4)),mean(sample_sums(psRun3and4)),
                  median(sample_sums(psRun3and4)),max(sample_sums(psRun3and4)),sum(sample_sums(psRun3and4)),
           nsamples(psRun3and4),ntaxa(psRun3and4)))
1- (sum(grepl("Unknown", tax_table(ps.run1seq)[, "Genus"])) / ntaxa(ps.run1seq))
1- (sum(grepl("Unknown", tax_table(psRun3and4)[, "Genus"])) / ntaxa(psRun3and4))


#---------------------------# 
#      Plot composition     #
#---------------------------#

#        Bacteria        #
#------------------------#
library(plyr)

# Transform ASV abundance to relative abundance
ps.RA12 <- transform_sample_counts(psRun1and2,function(x) x / sum(x))  
melt.RA12 <-psmelt(ps.RA12)  # Melt phyloseq obejct 
melt.RA12 <-droplevels(melt.RA12[melt.RA12$Abundance > 0 ,])

# Phylum proportion by sample type 
phyl.prop <- ddply(melt.RA12 , .(Sample_type, Phylum), summarise, count = sum(Abundance))
phyl.prop <- ddply(phyl.prop, " Sample_type", mutate, prop = count/sum(count))
phyl.prop$prop_cl <- round(phyl.prop$prop*100,2)
phyl.prop.st <- subset(phyl.prop, prop_cl >1)
# phyl.prop[order(phyl.prop$prop_cl,phyl.prop$Sample_type),]

# Phylum proportion by tissue type 
phyl.prop <- ddply(melt.RA12 , .(Sample_type, Tissue_type, Phylum), summarise, count = sum(Abundance))
phyl.prop <- ddply(phyl.prop, c("Sample_type", "Tissue_type"), mutate, prop = count/sum(count))
phyl.prop$prop_cl <- round(phyl.prop$prop*100,2)
phyl.prop.tt <- subset(phyl.prop, prop_cl >1)
# phyl.prop[order(phyl.prop$prop_cl,phyl.prop$Sample_type),]

# Class proportion by sample type 
class.prop <- ddply(melt.RA12 , .(Sample_type,Tissue_type, Phylum, Class), summarise, count = sum(Abundance)) # Sum ASV 
class.prop <-ddply(class.prop, c( "Tissue_type"), mutate, prop = count/sum(count)) # Get propotion by acorn type
class.prop <- ddply(class.prop, c("Sample_type","Tissue_type", "Phylum" ,"Class"), summarise, sum.prop = sum(prop)) # Sum the proportion 
class.prop$Class <- as.character(class.prop$Class)
class.prop$Class_clean <- with(class.prop, ifelse(
  sum.prop < 0.01 , 'Other',class.prop$Class))
class.prop$Class_clean <- as.factor(class.prop$Class_clean) 
dput(sort(unique(class.prop$Class_clean)))

class.prop$Class_clean <- factor(class.prop$Class_clean, 
                                 levels = c("Acidimicrobiia", "Acidobacteriae", "Actinobacteria", "Alphaproteobacteria", "Bacilli",
                                            "Bacteroidia", "Fimbriimonadia", "Gammaproteobacteria", "Myxococcia", "Thermoleophilia",
                                            "Other", "Unknown"))
order_class <- levels(class.prop$Class_clean)
class.prop$Tissue_type <- factor(class.prop$Tissue_type, levels = c("Leaf","E","I","N","End", "Ep",
                                                                    "Ram","Lie", "Bry",  "Bm","Lit", "Fra", "Riv"))
cols.b <- c(wes_palette("FantasticFox1", 10, type = "continuous"),"#c0bfc4","#6f726f") 

# Plot microbial composition in each sets
(Fig2b <- ggplot(class.prop, aes(x= Tissue_type, y=sum.prop, fill = Class_clean))+
  geom_bar(aes(fill=Class_clean), stat="identity", position = "fill")+ 
  scale_fill_manual(values = cols.b,breaks = order_class, name = "Microbial classes") + 
  theme_bw() +
  theme(axis.text=element_text(size=15,colour = "black"), 
        axis.title = element_text(size=20, colour = "black"),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.ticks =element_blank(),
        panel.border = element_blank()) + 
    scale_x_discrete(breaks = c("Leaf","E","I","N","End", "Ep", "Ram","Lie", "Bry",  "Bm","Lit", "Fra", "Riv"),
                     labels = c("Entire\n\ leaf","Leaf\n\ edge","Leaf\n\ center","Leaf\n\ blade","Endophytes","Epiphyte",
                                 "Beech\n\ twigs","Common\n\ Ivy","Bryophyte","Dead\n\ wood","Litter","Butcher's\n\ broom",
                                 "River\n\ bedding"),
                     name = "") +
    scale_y_continuous(name = "Relative abundance of ASVs", labels = scales::percent) )


#        Fungi           #
#------------------------#
library(plyr)

# Transform ASV abundance to relative abundance
ps.RA34 <- transform_sample_counts(psRun3and4,function(x) x / sum(x))  
melt.RA34 <-psmelt(ps.RA34)  # Melt phyloseq obejct 
melt.RA34 <-droplevels(melt.RA34[melt.RA34$Abundance > 0 ,])

# Phylum proportion by sample type 
phyl.prop <- ddply(melt.RA34 , .(Sample_type, Phylum), summarise, count = sum(Abundance))
phyl.prop <- ddply(phyl.prop, " Sample_type", mutate, prop = count/sum(count))
phyl.prop$prop_cl <- round(phyl.prop$prop*100,2)
phyl.prop.st <- subset(phyl.prop, prop_cl >1)
# phyl.prop[order(phyl.prop$prop_cl,phyl.prop$Sample_type),]

# Phylum proportion by tissue type 
phyl.prop <- ddply(melt.RA34 , .(Sample_type, Tissue_type, Phylum), summarise, count = sum(Abundance))
phyl.prop <- ddply(phyl.prop, c("Sample_type", "Tissue_type"), mutate, prop = count/sum(count))
phyl.prop$prop_cl <- round(phyl.prop$prop*100,2)
phyl.prop.tt <- subset(phyl.prop, prop_cl >1)
# phyl.prop[order(phyl.prop$prop_cl,phyl.prop$Sample_type),]

# Class proportion by sample type 
class.prop <- ddply(melt.RA34 , .(Sample_type,Tissue_type, Phylum, Class), summarise, count = sum(Abundance)) # Sum ASV 
class.prop <-ddply(class.prop, c( "Tissue_type"), mutate, prop = count/sum(count)) # Get propotion by acorn type
class.prop <- ddply(class.prop, c("Sample_type","Tissue_type", "Phylum" ,"Class"), summarise, sum.prop = sum(prop)) # Sum the proportion 
class.prop$Class <- as.character(class.prop$Class)
class.prop$Class_clean <- with(class.prop, ifelse(
  sum.prop < 0.01 , 'Other',class.prop$Class))
class.prop$Class_clean <- as.factor(class.prop$Class_clean) 
dput(sort(unique(class.prop$Class_clean)))

class.prop$Class_clean <- factor(class.prop$Class_clean, 
                                 levels = c("Agaricomycetes", "Agaricostilbomycetes","Cystobasidiomycetes", "Dothideomycetes",
                                            "Eurotiomycetes", "Lecanoromycetes", "Leotiomycetes", "Microbotryomycetes", "Orbiliomycetes",
                                            "Sordariomycetes", "Spizellomycetes", "Taphrinomycetes", "Tremellomycetes", 
                                            "Other","Unknown"))
order_class <- levels(class.prop$Class_clean)
class.prop$Tissue_type <- factor(class.prop$Tissue_type, levels = c("Leaf","E","I","N","End", "Ep",
                                                                    "Ram","Lie", "Bry",  "Bm","Lit", "Fra", "Riv"))
cols.f <- c(wes_palette("Zissou1", 13, type = "continuous"),"#c0bfc4","#6f726f") 

# Plot microbial composition in each sets
(Fig2f <- ggplot(class.prop, aes(x= Tissue_type, y=sum.prop, fill = Class_clean))+
  geom_bar(aes(fill=Class_clean), stat="identity", position = "fill")+ 
  scale_fill_manual(values = cols.f,breaks = order_class, name = "Microbial classes") + 
  theme_bw() +
  theme(axis.text=element_text(size=15,colour = "black"), 
        axis.title = element_text(size=20, colour = "black"),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.ticks =element_blank(),
        panel.border = element_blank()) + 
    scale_x_discrete(breaks = c("Leaf","E","I","N","End", "Ep", "Ram","Lie", "Bry",  "Bm","Lit", "Fra", "Riv"),
                     labels = c("Entire\n\ leaf","Leaf\n\ edge","Leaf\n\ center","Leaf\n\ blade","Endophytes","Epiphyte",
                                 "Beech\n\ twigs","Common\n\ Ivy","Bryophyte","Dead\n\ wood","Litter","Butcher's\n\ broom",
                                 "River\n\ bedding"),
                     name = "") +
    scale_y_continuous(name = "Relative abundance of ASVs", labels = scales::percent) )

library(cowplot)
lb <-get_legend(Fig2b) #Get legend 
lf <-get_legend(Fig2f) #Get legend 
(Fig2 <- plot_grid(Fig2b, Fig2f,nrow = 2, labels = c("A","B")))
ggsave(file.path(outp.fig,"Fig2.svg"),Fig2, width = 13, height = 10, dpi = 300)
```

# Microbial structure
## Set#1 
```{r microbial_structure_Set1}
library(vegan)
cols.height <- c("EB" = "#9e6200", "IB"= "#9e6200", "EM" = "#bb0000","IM" = "#bb0000" ,"EH" = "#f3cb01", "IH" = "#f3cb01")
shapes <- c("H727.Ext" = 16, "H760.Ext"= 17, "H790.Ext" = 15,
            "H727.Int" = 1 ,"H760.Int" = 2, "H790.Int" = 0)

#Bacterial community composition : PCoA and PERMANOVA 
sample_data(R1)$shapetoplot <- interaction(sample_data(R1)$Tree, sample_data(R1)$Level)
dJ.R1 <- phyloseq::distance(R1, method = "jaccard", binary = T)
metadata <- as(sample_data(R1), "data.frame") 
(aovb <- adonis(dJ.R1 ~ Height * Level * Date * Tree ,  data = metadata, perm = 999))

# Plot PCoA
PCoA <- ordinate(R1, method = "PCoA", distance = "jaccard", binary = T, try=100,set.seed(31584))
(PCA.canopy.b <- plot_ordination(R1, PCoA, color = "Position_branch",shape = "shapetoplot", axes = c(1,2)) +
  geom_point(aes(size =Date), stroke = 2) + 
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=25),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) +
   scale_color_manual(values = cols.height )  +
   scale_size_manual (values= c(3,5,7) )  +
   scale_shape_manual(values = shapes))
PCA.canopy.b$layers <- PCA.canopy.b$layers[-1]
PCA.canopy.b


#Fungal community composition : PCoA and PERMANOVA 
sample_data(R3)$shapetoplot <- interaction(sample_data(R3)$Tree, sample_data(R3)$Level)
dJ.R3 <- phyloseq::distance(R3, method = "jaccard", binary = T)
metadata <- as(sample_data(R3), "data.frame") 
(aovf <- adonis(dJ.R3 ~ Height * Level * Date * Tree ,  data = metadata, perm = 999))

# Plot PCoA
PCoA <- ordinate(R3, method = "PCoA", distance = "jaccard", binary = T, try=100,set.seed(31584))
(PCA.canopy.f <- plot_ordination(R3, PCoA, color = "Position_branch",shape = "shapetoplot", axes = c(1,2)) +
  geom_point(aes(size =Date), stroke = 2) + 
  stat_ellipse(type = "norm", linetype = 2, aes(group=Tree)) +
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=25),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) +
   scale_color_manual(values = cols.height )  +
   scale_size_manual (values= c(3,5,7) )  +
   scale_shape_manual(values = shapes))
PCA.canopy.f$layers <- PCA.canopy.f$layers[-1]
PCA.canopy.f


# Pool PERMANOVA results 
tabS1b <- as.data.frame(aovb$aov)
tabS1f <- as.data.frame(aovf$aov)
tabS1 <- rbind(tab3b,tab3f)
write.csv(tabS1,file.path(outp.tab, "tab3r.csv")) 

# Pool PCA graphs 
(PCA <- cowplot::plot_grid(PCA.canopy.b + theme(legend.position = "none"),
                   PCA.canopy.f + theme(legend.position = "none"),
                   nrow = 1))
ggsave(file.path(outp.fig,"PCoA.canopy.svg"),PCA, width =12, height = 6, dpi = 300)
```

## Set#2 
```{r microbial_structure_Set2}

library(vegan)
cols.tissue <- c("N" = "#006000", "I"= "#71b134", "E" = "#eae13b")

#------------------------# 
#        Bacteria        #
#------------------------#

#Bacterial community composition : PCoA and PERMANOVA 
Set2 <- subset_samples_no_zero(R2, Sample_type =="Leaf_piece")
sample_data(Set2)$toplot <- interaction(sample_data(Set2)$Position, sample_data(Set2)$Tissue_type)
sample_data(Set2)$strata <- interaction(sample_data(Set2)$Tree, sample_data(Set2)$Date)
dJ.Set2 <- phyloseq::distance(Set2, method = "jaccard", binary = T)
metadata <- as(sample_data(Set2), "data.frame") 
(aovbl <- adonis(dJ.Set2 ~  Tissue_type * Position ,  data = metadata, perm = 999, strata = metadata$strata))

# Plot PCoA
PCoA <- ordinate(Set2, method = "PCoA", distance = "jaccard", binary = T, try=100,set.seed(31584))
(PCA.lp.b <- plot_ordination(Set2, PCoA, color = "Tissue_type", shape= "Position",axes = c(1,2)) +
  geom_point(size = 4.5) + 
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=15),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) +
    scale_color_manual(values = cols.tissue)
  )

#------------------------# 
#        Fungi           #
#------------------------#

# PERMANOVA on the Matrix built using jaccard  distance 
Set2 <- subset_samples_no_zero(R4, Sample_type =="Leaf_piece")
sample_data(Set2)$toplot <- interaction(sample_data(Set2)$Position, sample_data(Set2)$Tissue_type)
sample_data(Set2)$strata <- interaction(sample_data(Set2)$Tree, sample_data(Set2)$Date)
# sample_data(Set2)$seg <- substr(rownames(sample_data(Set2)), 1, 11)

dJ.Set2 <- phyloseq::distance(Set2, method = "jaccard", binary = T)
metadata <- as(sample_data(Set2), "data.frame") 
(aovfl <- adonis(dJ.Set2 ~  Tissue_type * Position ,  data = metadata, perm = 999, strata = metadata$strata))

# Plot PCoA
PCoA <- ordinate(Set2, method = "PCoA", distance = "jaccard", binary = T, try=100,set.seed(31584))
(PCA.lp.f <- plot_ordination(Set2, PCoA, color = "Tissue_type", shape= "Position",axes = c(1,2)) +
  # stat_ellipse(type = "norm", linetype = 2, aes(group=seg)) +
  geom_point(size = 4.5) + 
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=15),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) +
    scale_color_manual(values = cols.tissue)
  )

(PCoA <- cowplot::plot_grid(PCA.lp.b + theme(legend.position = "none"),
                   PCA.lp.f + theme(legend.position = "none"),
                   ncol = 2))
ggsave(file.path(outp.fig,"PCoA.lp.svg"),PCoA, width = 12, height = 6, dpi = 300)

# Pool PERMANOVA results 
tabS2b <- as.data.frame(aovbl$aov)
tabS2f <- as.data.frame(aovfl$aov)
tabS2LVB <- rbind(tabS2b,tabS2f)
# write.csv(tab4,file.path(outp.tab, "tabS2r.csv")) 
```

# Set#3 
```{r microbial_structure_Set3}

cols.height <- c("EB" = "#9e6200", "IB"= "#c8983e", "EM" = "#a52e01","IM" = "#d43b0e" ,"EH" = "#f3cb01", "IH" = "#f2ea8a")

#------------------------# 
#        Bacteria        #
#------------------------#

# PERMANOVA on the Matrix built using jaccard  distance 
Set3 <- subset_samples_no_zero(R2, Sample_type =="Leaf_tissu")
dJ.Set3 <- phyloseq::distance(Set3, method = "jaccard", binary = T)
metadata <- as(sample_data(Set3), "data.frame") 
(aovbt <- adonis(dJ.Set3 ~  Tissue_type * Position ,  data = metadata, perm = 999))

PCoA <- ordinate(Set3, method = "PCoA", distance = "jaccard", binary = T, try=100,set.seed(31584))
(PCA.t.b <- plot_ordination(Set3, PCoA, color = "Position", shape= "Tissue_type",axes = c(1,2)) +
  geom_point(size=4.5, stroke = 1.1) + 
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=15),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) +
   scale_color_manual(values =  cols.height) )


#------------------------# 
#        Fungi           #
#------------------------#

# PERMANOVA on the Matrix built using jaccard  distance 
Set3 <- subset_samples_no_zero(R4, Sample_type =="Leaf_tissu")
dJ.Set3 <- phyloseq::distance(Set3, method = "jaccard", binary = T)
metadata <- as(sample_data(Set3), "data.frame") 
(aovft <- adonis(dJ.Set3 ~  Tissue_type * Position ,  data = metadata, perm = 999))

PCoA <- ordinate(Set3, method = "PCoA", distance = "jaccard", binary = T, try=100,set.seed(31584))
(PCA.t.f <- plot_ordination(Set3, PCoA, color = "Position", shape= "Tissue_type",axes = c(1,2)) +
  geom_point(size=4.5, stroke = 1.1) + 
  theme_bw() +
  theme(plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=15),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) +
   scale_color_manual(values =  cols.height) )

(PCoA <- cowplot::plot_grid(PCA.t.b + theme(legend.position = "none"),
                   PCA.t.f + theme(legend.position = "none"),
                   ncol = 2))
ggsave(file.path(outp.fig,"PCoA.lt.svg"),PCoA, width = 12, height = 6, dpi = 300)

# Pool PERMANOVA results 
tabS2b <- as.data.frame(aovbt$aov)
tabS2f <- as.data.frame(aovft$aov)
tabS2LST <- rbind(tabS2b,tabS2f)

tabS24 <- rbind(tabS2LVB,tabS24LST)
write.csv(tab4,file.path(outp.tab, "tabS2r.csv")) 
```

# Set#4
```{r microbial_structure_Set4}

shapes <- c("EH" = 16, "IB" = 1)

#------------------------# 
#        Bacteria        #
#------------------------#

# PERMANOVA on the Matrix built using jaccard  distance 
Set4 <- subset_samples_no_zero(R2, Sample_type =="Litter")
dJ.Set4 <- phyloseq::distance(Set4, method = "jaccard", binary = T)
metadata <- as(sample_data(Set4), "data.frame") 
(aovblit <- adonis(dJ.Set4 ~  Date * Position ,  data = metadata, perm = 999, strata = metadata$Tree))

PCoA <- ordinate(Set4, method = "PCoA", distance = "jaccard", binary = T, try=100,set.seed(31584))
(PcoAblit <- plot_ordination(Set4, PCoA, color = "Date", shape= "Position",axes = c(1,2)) +
  geom_point(size=7, stroke = 2) + 
  theme_bw() +
  theme(panel.grid = element_blank(),
        plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=30),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) +
  scale_color_manual(values =  c(J1 = "#ccd04a", J15 ="#5fa54b", J30 ="#d35238", J60 = "#b77544")) +
  scale_shape_manual(values = shapes))
PcoAblit$layers <- PcoAblit$layers[-1]
PcoAblit

#------------------------# 
#        Fungi           #
#------------------------#

# PERMANOVA on the Matrix built using jaccard  distance 
Set4 <- subset_samples_no_zero(R4, Sample_type =="Litter")
dJ.Set4 <- phyloseq::distance(Set4, method = "jaccard", binary = T)
metadata <- as(sample_data(Set4), "data.frame") 
(aovflit <- adonis(dJ.Set4 ~  Date * Position ,  data = metadata, perm = 999, strata = metadata$Tree))

PCoA <- ordinate(Set4, method = "PCoA", distance = "jaccard", binary = T, try=100,set.seed(31584))
(PcoAflit <- plot_ordination(Set4, PCoA, color = "Date", shape= "Position",axes = c(1,2)) +
  geom_point(size=7, stroke = 2) + 
  theme_bw() +
  theme(panel.grid = element_blank(),
        plot.title = element_text(size=15), 
        axis.text=element_text(size=15), 
        axis.title = element_text(size=30),
        legend.text = element_text(size=15), 
        legend.title = element_text(size=15),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.title.x = element_text(colour = "black"),
        axis.title.y = element_text(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA)) +
  scale_color_manual(values =  c(J1 = "#ccd04a", J15 ="#5fa54b", J30 ="#d35238", J60 = "#b77544")) +
  scale_shape_manual(values = shapes))
PcoAflit$layers <- PcoAflit$layers[-1]
PcoAflit

(PCoA <- cowplot::plot_grid(PcoAblit + theme(legend.position = "none"),
                   PcoAflit + theme(legend.position = "none"),
                   nrow = 1))
ggsave(file.path(outp.fig,"PCoA.litter.svg"),PCoA, width =12, height = 6, dpi = 300)


# Pool PERMANOVA results 
tabS3b <- as.data.frame(aovblit$aov)
tabS3f <- as.data.frame(aovflit$aov)
tabS3 <- rbind(tabS3b,tabS3f)
write.csv(tab5,file.path(outp.tab, "tabS3r.csv")) 

```

